#include "mpi.h"
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <assert.h>

/*	Measures bandwidth and/or latency between pairs of processes */

/* Message-size limits, buffer size and default test parameters. */
#define DEF_BW_SIZE (1024*1024)	// default bandwidth message size: 1M bytes
#define MAX_BW_SIZE (4*DEF_BW_SIZE)	// largest message size accepted: 4M bytes
#define MYBUFSIZE ((4*DEF_BW_SIZE)+4096)	// size of each send/receive buffer: 4M bytes + 4096
#define MAX_REQ_NUM 100000	// length of the request/status arrays below; also the upper limit for -l
#define SIZE_MULT 4	// factor by which the message size grows when sweeping sizes (-s -1)
#define DEF_LOOP 1000	// default number of transfers per test
#define DEF_LT_SIZE 1		// default latency message size: 1 byte
#define OUT_PER_RANK 1	// value of -output that enables per-sender result lines

MPI_Request request[MAX_REQ_NUM];	// non-blocking requests of the bandwidth tests (sends; receives on the one-directional receiver)
MPI_Request rcv_request[MAX_REQ_NUM];	// used only for rcv requests in bidirectional bw
MPI_Status rcv_stat[MAX_REQ_NUM];	// statuses filled by MPI_Waitall and by the per-batch acknowledgement receive

/*
 * Prints the command-line help on stderr (rank 0 only), shuts MPI down
 * and returns 0, so that main can simply `return usage (myid);`.
 *
 *   myid - MPI rank of the calling process.
 */
int
usage (int myid)
{
    /* The text contains no conversion specifiers, so it is written verbatim. */
    static const char help_text[] =
	"\nUsage: mpi_p -l [loops] -s [size] -t [type] -c [cycle] -order [0|1] -output [0|1] -warmup [0|1] (All arguments are optional!)\n\n"
	"\tloops  - the number of times to transfer the data, -1 for endless run. (default: 1000)\n"
	"\tsize   - the number of bytes to transfer, -1 will run from 1 to 4M. (default: 1048576)\n"
	"\ttype   - bw|bibw|lt|all - run bandwidth, bidirectional bandwidth, latency or all. (default: all)\n"
	"\tcycle  - number of sequential data transfer itterations. (default: 64)\n"
	"\torder  - 0|1 - senders are: even ranks (i -> i+1), or - "
	"first half of ranks (i -> i+np/2) respectively. (default: 0)\n"
	"\toutput - 0|1 - outputs results per sender rank in addition to min, max, avg. (default: 0 - no output per sender)\n"
	"\twarmup - 0|1 - with or without warmup before test. (default: 1 - with warmup)\nExample:\n"
	"\t/usr/voltaire/mpi/bin/mpirun_ssh -np 2 -hostfile host_file /usr/voltaire/mpi/bin/mpi_p -l 100 -s 5000 -t bw -order 1\n";

    if (myid == 0)
	{
	    fputs (help_text, stderr);
	}

    MPI_Finalize ();
    return 0;
}



int
main (int argc, char *argv[])
{
    int myid, numprocs, midd, remote, i, j, recive_rank;	// handles processes
    int loop, size_bw, size_lt, size;	// original test params
    int output = 0;		// for 1 will print results per rank
    int order, sender = 0;	// hosts coupeling
    int test_bw = 0, test_lt = 0, test_bibw = 0, warmup = 1;	// tests flags
    char test_type[10];		// test type
    char *s_buf, *r_buf;	// send receive buffers
    double t_start = 0.0, t_end = 0.0, t = 0.0;	// time
    double m_bytes, bandwidth, latency;	// holds test result at sender proc
    double *param_bw, *param_lt;	// holds the gathered results at root
    double sum_bw, min_bw, max_bw;	// holds bw tests summaries
    double sum_lt, min_lt, max_lt;	// holds lt tests summaries
    int bw_s_loop = 0, bibw_s_loop = 0, lt_s_loop = 0;	// msg sizes loop
    int skip = 10;		// warmup iterations in latency test, in addition to loop size
    int param_limit = 0, param_skip = 1;	// used at the loop that summarize all tests result,
    int loop_size;
    int cycle = 64;
    int reminder;
    int endless_count = 1;
    //  will consider results from sender nodes onlyi
    int endless = 0;
    int test_lt_orig;
    int test_bw_orig;
    int test_bibw_orig;
    int size_bw_orig;
    int size_lt_orig;
    int bw_s_loop_orig;
    int bibw_s_loop_orig;
    int lt_s_loop_orig;
    int size_orig;
    int loop_orig;

    MPI_Status lt_rcv_stat;
    MPI_Init (&argc, &argv);
    MPI_Comm_size (MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank (MPI_COMM_WORLD, &myid);

    midd = numprocs / 2;

    if (numprocs % 2 != 0)
	{
	    if (myid == 0)
		fprintf (stderr,
			 "\nUsage: only even number of processors allowed\n");
	    MPI_Finalize ();
	    return 0;
	}

    /*set defaults */
    loop = DEF_LOOP;
    size = size_bw = DEF_BW_SIZE;
    size_lt = DEF_LT_SIZE;
    strcpy (test_type, "all");    
    order = 0;
    output = 0;
    warmup = 1;

    /*parse input arguments */
    if (argc == 2)
	{
	    if (!strcmp (argv[1], "-h") || !strcmp (argv[1], "--help"))
		{             
		    return usage (myid);
		}
	}
    for (i = 1; i < argc - 1; i = i + 2)
	{
	    if (!strcmp (argv[i], "-l"))
		{
		    loop = atoi (argv[i + 1]);
		    if (loop == -1)
			{
			    endless = 1;
			    loop = DEF_LOOP;
			}
		}
	    else if (!strcmp (argv[i], "-s"))
		{
		    size = atoi (argv[i + 1]);
		    size_lt = size_bw = size;
		}
	    else if (!strcmp (argv[i], "-t"))
		{
		    strcpy (test_type, argv[i + 1]);
		}
	    else if (!strcmp (argv[i], "-c"))
		{
		    cycle = atoi (argv[i + 1]);
		}
	    else if (!strcmp (argv[i], "-order"))
		{
		    order = atoi (argv[i + 1]);
		}
	    else if (!strcmp (argv[i], "-output"))
		{
		    output = atoi (argv[i + 1]);
		}
	    else if (!strcmp (argv[i], "-warmup"))
		{
		    warmup = atoi (argv[i + 1]);
		}
	    else
		{            
		    return usage (myid);
		}

	}

    /*check input is legal */
    if ( (size <= 0 && size != -1) || (loop <= 0 && loop != -1) || cycle <= 0)
	{
	    fprintf (stderr, "Illegal input\n");
	    return usage (myid);
	}
    if (order != 1)
	{
	    order = 0;		// default remote is myid+1 (for even nodes)
	}
    if (size_bw > MAX_BW_SIZE || size_lt > MAX_BW_SIZE)
	{
	    fprintf (stderr, "Maximum message size is %d\n", MAX_BW_SIZE);
	    MPI_Finalize ();
	    return 0;
	}
    if (loop > MAX_REQ_NUM)
	{
	    fprintf (stderr, "Maximum number of iterations is %d\n",
		     MAX_REQ_NUM);
	    MPI_Finalize ();
	    return 0;
	}
    if (size_bw == -1)
	{
	    bw_s_loop = bibw_s_loop = lt_s_loop = 1;
	    size_bw = size_lt = DEF_BW_SIZE;
	}

    // determine test type
    if (strcmp (test_type, "all") == 0)
	{
	    test_bw = 1;
	    test_lt = 1;
	    test_bibw = 1;
	}
    else if (strcmp (test_type, "bw") == 0)
	{
	    test_bw = 1;
	    test_lt = 0;
	    test_bibw = 0;
	}
    else if (strcmp (test_type, "lt") == 0)
	{
	    test_lt = 1;
	    test_bw = 0;
	    test_bibw = 0;
	}
    else if (strcmp (test_type, "bibw") == 0)
	{
	    test_lt = 0;
	    test_bw = 0;
	    test_bibw = 1;
	}
    else
	{
	    return usage (myid);
	}

    loop_size = loop / cycle;
    reminder = loop % cycle;

    // memory allocation (allocate maximum size needed for both tests)
    if (size_bw > size_lt)
	{
	    size = size_bw;
	}
    else
	{
	    size = size_lt;
	}

    s_buf = (char *) malloc (MYBUFSIZE * sizeof (char));
    assert (s_buf);

    r_buf = (char *) malloc (MYBUFSIZE * sizeof (char));
    assert (r_buf);

    //memory allocation for the results array
    param_bw = (double *) malloc (numprocs * sizeof (double));
    param_lt = (double *) malloc (numprocs * sizeof (double));

    if (s_buf == NULL || r_buf == NULL || param_bw == NULL
	|| param_lt == NULL)
	{
	    fprintf (stderr, "Memory allocation error\n");
	    return 0;
	}

    if (endless)
	{
	    test_lt_orig = test_lt;
	    test_bw_orig = test_bw;
	    test_bibw_orig = test_bibw;
	    size_bw_orig = size_bw;
	    size_lt_orig = size_lt;
	    bw_s_loop_orig = bw_s_loop;
	    lt_s_loop_orig = lt_s_loop;
	    size_orig = size;
	    loop_orig = loop;
	}
  do_for_ever:
    if (endless)
	{
	    test_lt = test_lt_orig;
	    test_bw = test_bw_orig;
	    test_bibw = test_bibw_orig;
	    size_bw = size_bw_orig;
	    size_lt = size_lt_orig;
	    bw_s_loop = bw_s_loop_orig;
	    lt_s_loop = lt_s_loop_orig;
	    size = size_orig;
	    loop = loop_orig;
	}

    MPI_Barrier (MPI_COMM_WORLD);

    // determine which procs will be the senders (even nodes or first half of nodes, according to order arg.)
    //      and see who their talking with (remote)
    if (order == 0)
	{
	    param_limit = numprocs;
	    param_skip = 2;
	    if (myid % 2 == 0)
		{
		    remote = myid + 1;
		    sender = 1;
		}
	    else
		{
		    remote = myid - 1;
		}
	}
    else
	{
	    param_limit = numprocs / 2;
	    param_skip = 1;
	    if (myid < midd)
		{
		    remote = myid + midd;
		    sender = 1;
		}
	    else
		{
		    remote = myid - midd;
		}
	}

    if (test_bw)
	{
	    if (bw_s_loop)
		{
		    size_bw = 1;
		}

	    // start bandwidth test (first try is for warmup)
	    do
		{
		    /* warmup */
		    if (warmup)
			{
			    /*       if (myid==0) fprintf(stderr,"[%d]warming up\n",myid); */
			    if (sender)
				{
				    for (j = 0, cycle = 64; j <= loop_size;
					 j++)
					{
					    if (j == loop_size)
						cycle = reminder;
					    for (i = 0; i < cycle; i++)
						{
						    MPI_Isend (s_buf, size_bw,
							       MPI_CHAR,
							       remote, 100,
							       MPI_COMM_WORLD,
							       request + i);
						}
					    MPI_Waitall (cycle, request,
							 rcv_stat);
					    MPI_Recv (r_buf, 4, MPI_CHAR,
						      remote, 101,
						      MPI_COMM_WORLD,
						      &rcv_stat[0]);
					}
				}
			    else
				{
				    for (j = 0, cycle = 64; j <= loop_size;
					 j++)
					{
					    if (j == loop_size)
						cycle = reminder;
					    for (i = 0; i < cycle; i++)
						{
						    MPI_Irecv (r_buf, size_bw,
							       MPI_CHAR,
							       remote, 100,
							       MPI_COMM_WORLD,
							       request + i);
						}
					    MPI_Waitall (cycle, request,
							 rcv_stat);
					    MPI_Send (s_buf, 4, MPI_CHAR,
						      remote, 101,
						      MPI_COMM_WORLD);
					}
				}
			    MPI_Barrier (MPI_COMM_WORLD);
			}
		    /* real test */
		    if (sender)
			{
			    t_start = MPI_Wtime ();

			    for (j = 0, cycle = 64; j <= loop_size; j++)
				{
				    if (j == loop_size)
					cycle = reminder;
				    for (i = 0; i < cycle; i++)
					{
					    MPI_Isend (s_buf, size_bw,
						       MPI_CHAR, remote, 100,
						       MPI_COMM_WORLD,
						       request + i);
					}
				    MPI_Waitall (cycle, request, rcv_stat);
				    MPI_Recv (r_buf, 4, MPI_CHAR, remote, 101,
					      MPI_COMM_WORLD, &rcv_stat[0]);
				}

			    t_end = MPI_Wtime ();
			    t = t_end - t_start;
			}
		    else
			{
			    for (j = 0, cycle = 64; j <= loop_size; j++)
				{
				    if (j == loop_size)
					cycle = reminder;
				    for (i = 0; i < cycle; i++)
					{
					    MPI_Irecv (r_buf, size_bw,
						       MPI_CHAR, remote, 100,
						       MPI_COMM_WORLD,
						       request + i);
					}
				    MPI_Waitall (cycle, request, rcv_stat);
				    MPI_Send (s_buf, 4, MPI_CHAR, remote, 101,
					      MPI_COMM_WORLD);
				}
			}

		    // calculate result (mbytes/sec)
		    if (sender)
			{
			    m_bytes = ((size_bw * 1.0) / 1.0e6) * loop;
			    bandwidth = m_bytes / t;
			}
		    MPI_Barrier (MPI_COMM_WORLD);

		    // gather all results in rank 0
		    MPI_Gather (&bandwidth, 1, MPI_DOUBLE, param_bw, 1,
				MPI_DOUBLE, 0, MPI_COMM_WORLD);

		    if (myid == 0)
			{
			    sum_bw = 0;
			    max_bw = 0;
			    min_bw = param_bw[0];
			    for (i = 0; i < param_limit; i += param_skip)
				{
				    if (output == OUT_PER_RANK)
					{
					    if (order == 0)
						{
						    recive_rank = i + 1;
						}
					    else
						{
						    recive_rank = i + midd;
						}
					    fprintf (stdout,
						     "bw: [%d]->[%d]: %d\t%f\n",
						     i, recive_rank, size_bw,
						     param_bw[i]);
					}
				    sum_bw += param_bw[i];
				    if (param_bw[i] < min_bw)
					{
					    min_bw = param_bw[i];
					}
				    if (param_bw[i] > max_bw)
					{
					    max_bw = param_bw[i];
					}
				}
			    // print summary
			    fprintf (stdout,
				     "BW (%d) (size min max avg)  %d\t%f\t%f\t%f\n",
				     numprocs, size_bw, min_bw, max_bw,
				     sum_bw / midd);
			}
		    MPI_Barrier (MPI_COMM_WORLD);

		    if (bw_s_loop)
			{
			    size_bw *= SIZE_MULT;
			}
		    if (size_bw > MAX_BW_SIZE)
			{
			    bw_s_loop = 0;
			}
		}
	    while (bw_s_loop);	// end of sizes loop
	}			// end of bandwidth test
    size_bw = size;		//return to user defined size

    if (test_bibw)
	{
	    if (bibw_s_loop)
		{
		    size_bw = 1;
		}

	    // start bibandwidth test (first try is for warmup)
	    do
		{
		    /* warmup */
		    if (warmup)
			{
			    /*       if (myid==0) fprintf(stderr,"[%d]warming up\n",myid); */
			    if (sender)
				{
				    for (j = 0, cycle = 64; j <= loop_size;
					 j++)
					{
					    if (j == loop_size)
						cycle = reminder;
					    for (i = 0; i < cycle; i++)
						{
						    MPI_Isend (s_buf, size_bw,
							       MPI_CHAR,
							       remote, 100,
							       MPI_COMM_WORLD,
							       request + i);
						    MPI_Irecv (r_buf, size_bw,
							       MPI_CHAR,
							       remote, 101,
							       MPI_COMM_WORLD,
							       rcv_request +
							       i);
						}
					    MPI_Waitall (cycle, request,
							 rcv_stat);
					    MPI_Waitall (cycle, rcv_request,
							 rcv_stat);
					    MPI_Recv (r_buf, 4, MPI_CHAR,
						      remote, 102,
						      MPI_COMM_WORLD,
						      &rcv_stat[0]);
					}
				}
			    else
				{
				    for (j = 0, cycle = 64; j <= loop_size;
					 j++)
					{
					    if (j == loop_size)
						cycle = reminder;
					    for (i = 0; i < cycle; i++)
						{
						    MPI_Isend (s_buf, size_bw,
							       MPI_CHAR,
							       remote, 101,
							       MPI_COMM_WORLD,
							       request + i);
						    MPI_Irecv (r_buf, size_bw,
							       MPI_CHAR,
							       remote, 100,
							       MPI_COMM_WORLD,
							       rcv_request +
							       i);
						}
					    MPI_Waitall (cycle, request,
							 rcv_stat);
					    MPI_Waitall (cycle, rcv_request,
							 rcv_stat);
					    MPI_Send (s_buf, 4, MPI_CHAR,
						      remote, 102,
						      MPI_COMM_WORLD);
					}
				}
			    MPI_Barrier (MPI_COMM_WORLD);
			}
		    /* real test */
		    if (sender)
			{
			    t_start = MPI_Wtime ();

			    for (j = 0, cycle = 64; j <= loop_size; j++)
				{
				    if (j == loop_size)
					cycle = reminder;
				    for (i = 0; i < cycle; i++)
					{
					    MPI_Isend (s_buf, size_bw,
						       MPI_CHAR, remote, 100,
						       MPI_COMM_WORLD,
						       request + i);
					    MPI_Irecv (r_buf, size_bw,
						       MPI_CHAR, remote, 101,
						       MPI_COMM_WORLD,
						       rcv_request + i);
					}
				    MPI_Waitall (cycle, request, rcv_stat);
				    MPI_Waitall (cycle, rcv_request,
						 rcv_stat);
				    MPI_Recv (r_buf, 4, MPI_CHAR, remote, 102,
					      MPI_COMM_WORLD, &rcv_stat[0]);
				}

			    t_end = MPI_Wtime ();
			    t = t_end - t_start;
			}
		    else
			{
			    for (j = 0, cycle = 64; j <= loop_size; j++)
				{
				    if (j == loop_size)
					cycle = reminder;
				    for (i = 0; i < cycle; i++)
					{
					    MPI_Isend (s_buf, size_bw,
						       MPI_CHAR, remote, 101,
						       MPI_COMM_WORLD,
						       request + i);
					    MPI_Irecv (r_buf, size_bw,
						       MPI_CHAR, remote, 100,
						       MPI_COMM_WORLD,
						       rcv_request + i);
					}
				    MPI_Waitall (cycle, request, rcv_stat);
				    MPI_Waitall (cycle, rcv_request,
						 rcv_stat);
				    MPI_Send (s_buf, 4, MPI_CHAR, remote, 102,
					      MPI_COMM_WORLD);
				}
			}
		    // calculate result (mbytes/sec)
		    if (sender)
			{
			    m_bytes = ((size_bw * 1.0) / 1.0e6) * loop * 2;	//multiply by 2 for biderctional bw
			    bandwidth = m_bytes / t;
			}
		    MPI_Barrier (MPI_COMM_WORLD);

		    // gather all results in rank 0
		    MPI_Gather (&bandwidth, 1, MPI_DOUBLE, param_bw, 1,
				MPI_DOUBLE, 0, MPI_COMM_WORLD);

		    if (myid == 0)
			{
			    sum_bw = 0;
			    max_bw = 0;
			    min_bw = param_bw[0];
			    for (i = 0; i < param_limit; i += param_skip)
				{
				    if (output == OUT_PER_RANK)
					{
					    if (order == 0)
						{
						    recive_rank = i + 1;
						}
					    else
						{
						    recive_rank = i + midd;
						}
					    fprintf (stdout,
						     "bw: [%d]->[%d]: %d\t%f\n",
						     i, recive_rank, size_bw,
						     param_bw[i]);
					}
				    sum_bw += param_bw[i];
				    if (param_bw[i] < min_bw)
					{
					    min_bw = param_bw[i];
					}
				    if (param_bw[i] > max_bw)
					{
					    max_bw = param_bw[i];
					}
				}
			    // print summary
			    fprintf (stdout,
				     "BI-BW (%d) (size min max avg)  %d\t%f\t%f\t%f\n",
				     numprocs, size_bw, min_bw, max_bw,
				     sum_bw / midd);
			}
		    MPI_Barrier (MPI_COMM_WORLD);

		    if (bibw_s_loop)
			{
			    size_bw *= SIZE_MULT;
			}
		    if (size_bw > MAX_BW_SIZE)
			{
			    bibw_s_loop = 0;
			}
		}
	    while (bibw_s_loop);	// end of sizes loop
	}			// end of biderctional bandwidth test

    if (test_lt)
	{

	    if (lt_s_loop)
		{
		    size_lt = 1;
		}

	    // start latency test (skip some iterations at start)
	    do
		{
		    if (sender)
			{	//run warmup
			    for (i = 0; i < skip; i++)
				{
				    MPI_Send (s_buf, size_lt, MPI_CHAR,
					      remote, i, MPI_COMM_WORLD);
				    MPI_Recv (r_buf, size_lt, MPI_CHAR,
					      remote, i + 1000,
					      MPI_COMM_WORLD, &lt_rcv_stat);
				}

			    t_start = MPI_Wtime ();
			    for (j = 0, cycle = 64; j <= loop_size; j++)
				{
				    if (j == loop_size)
					cycle = reminder;
				    for (i = 0; i < cycle; i++)
					{
					    MPI_Send (s_buf, size_lt,
						      MPI_CHAR, remote, i,
						      MPI_COMM_WORLD);
					    MPI_Recv (r_buf, size_lt,
						      MPI_CHAR, remote,
						      i + 1000,
						      MPI_COMM_WORLD,
						      &lt_rcv_stat);
					}
				}
			    t_end = MPI_Wtime ();
			}
		    else
			{	//run warmup
			    for (i = 0; i < skip; i++)
				{
				    MPI_Recv (r_buf, size_lt, MPI_CHAR,
					      remote, i, MPI_COMM_WORLD,
					      &lt_rcv_stat);
				    MPI_Send (s_buf, size_lt, MPI_CHAR,
					      remote, i + 1000,
					      MPI_COMM_WORLD);
				}

			    for (j = 0, cycle = 64; j <= loop_size; j++)
				{
				    if (j == loop_size)
					cycle = reminder;
				    for (i = 0; i < cycle; i++)
					{
					    MPI_Recv (r_buf, size_lt,
						      MPI_CHAR, remote, i,
						      MPI_COMM_WORLD,
						      &lt_rcv_stat);
					    MPI_Send (s_buf, size_lt,
						      MPI_CHAR, remote,
						      i + 1000,
						      MPI_COMM_WORLD);
					}
				}
			}

		    // calculate result (time in usec)
		    if (sender)
			{
			    latency =
				(t_end - t_start) * 1.0e6 / (2.0 * loop);
			}
		    MPI_Barrier (MPI_COMM_WORLD);

		    // gather all results in rank 0
		    MPI_Gather (&latency, 1, MPI_DOUBLE, param_lt, 1,
				MPI_DOUBLE, 0, MPI_COMM_WORLD);

		    if (myid == 0)
			{
			    sum_lt = 0;
			    max_lt = 0;
			    min_lt = param_lt[0];
			    for (i = 0; i < param_limit; i += param_skip)
				{
				    if (output == OUT_PER_RANK)
					{
					    if (order == 0)
						{
						    recive_rank = i + 1;
						}
					    else
						{
						    recive_rank = i + midd;
						}
					    fprintf (stdout,
						     "LT: [%d]->[%d]: %d\t%f\t\n",
						     i, recive_rank, size_lt,
						     param_lt[i]);
					}
				    sum_lt += param_lt[i];
				    if (param_lt[i] < min_lt)
					{
					    min_lt = param_lt[i];
					}
				    if (param_lt[i] > max_lt)
					{
					    max_lt = param_lt[i];
					}
				}

			    // print summary
			    fprintf (stdout,
				     "LT (%d) (size min max avg)  %d\t%f\t%f\t%f\n",
				     numprocs, size_lt, min_lt, max_lt,
				     sum_lt / (numprocs / 2));
			}
		    MPI_Barrier (MPI_COMM_WORLD);

		    if (lt_s_loop)
			{
			    size_lt *= SIZE_MULT;
			}
		    if (size_lt >= MAX_BW_SIZE)
			{
			    lt_s_loop = 0;
			}
		}
	    while (lt_s_loop);	// end of sizes loop
	}			// end of latency test


    if (endless)
	{
	    if (myid == 0)
		{
		    endless_count++;		    
		    fprintf (stdout, "loop number: %d\n", endless_count);
		}
	    goto do_for_ever;
	}


    MPI_Barrier (MPI_COMM_WORLD);
    MPI_Finalize ();
    free (s_buf);
    free (r_buf);
    free (param_bw);
    free (param_lt);
    return 0;
}
